資料輸入與前處理
首先,我們讀入主要用來分析的資料集合。
# Required Library
library("tidyverse")## ─ Attaching packages ───────────────────── tidyverse 1.2.1 ─
## ✔ ggplot2 3.0.0 ✔ purrr 0.2.5
## ✔ tibble 1.4.2 ✔ dplyr 0.7.6
## ✔ tidyr 0.8.1 ✔ stringr 1.3.1
## ✔ readr 1.1.1 ✔ forcats 0.3.0
## Warning: package 'dplyr' was built under R version 3.5.1
## ─ Conflicts ─────────────────────── tidyverse_conflicts() ─
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
# Please set your own working directory
setwd("~/Google 雲端硬碟/[Hahow] R 語言和商業分析課程/Part 2 - 資料科學核心方法/2-3 PCA/Case")
# Read Dataset
financial.data <- read_csv("2017_financial index_163 comp.csv")## Parsed with column specification:
## cols(
## comp_id = col_integer(),
## roe = col_double(),
## roa = col_double(),
## profit_margin_rate = col_double(),
## gross_margin_rate = col_double(),
## expense_rate = col_double(),
## asset_turnover = col_double(),
## inventory_turnover = col_double(),
## equity_turnnover = col_double(),
## rev_growth_rate = col_double(),
## margin_growth_rate = col_double(),
## op_profit_growth_rate = col_number(),
## cash_reinv_rate = col_double(),
## asset_growth_rate = col_double(),
## current_ratio = col_number(),
## quick_rartio = col_number(),
## debt_ratio = col_double()
## )
讓我們來看看「財務資料集合」資料集合,你會發現所有比率的資料都是以 (%) 方式呈現。由於我們會「標準化」後再進行 PCA,所以此時不需要將這些比率除以 100。
head(financial.data, 5)## # A tibble: 5 x 17
## comp_id roe roa profit_margin_rate gross_margin_rate expense_rate
## <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2303 3.06 2.21 4.4 18.1 14.8
## 2 2330 23.6 17.8 39.4 50.6 11.0
## 3 2337 25.7 14.3 16.8 37.0 20.1
## 4 2342 -3.41 -0.72 3.86 16.9 13
## 5 2344 10.9 7.69 14.0 34.3 20.3
## # ... with 11 more variables: asset_turnover <dbl>,
## # inventory_turnover <dbl>, equity_turnnover <dbl>,
## # rev_growth_rate <dbl>, margin_growth_rate <dbl>,
## # op_profit_growth_rate <dbl>, cash_reinv_rate <dbl>,
## # asset_growth_rate <dbl>, current_ratio <dbl>, quick_rartio <dbl>,
## # debt_ratio <dbl>
資料探索
讓我們先來看看每一個變數的敘述統計量。
summary(financial.data[, 2:ncol(financial.data)])## roe roa profit_margin_rate
## Min. :-245.2000 Min. :-132.5700 Min. :-252.05
## 1st Qu.: -0.2825 1st Qu.: -0.2225 1st Qu.: -0.50
## Median : 7.9800 Median : 5.8200 Median : 7.72
## Mean : 4.3588 Mean : 4.1785 Mean : 3.46
## 3rd Qu.: 16.5400 3rd Qu.: 11.5100 3rd Qu.: 16.13
## Max. : 37.0100 Max. : 28.9300 Max. : 51.67
## gross_margin_rate expense_rate asset_turnover inventory_turnover
## Min. :-27.60 Min. : 1.78 Min. :0.030 Min. : 0.000
## 1st Qu.: 18.39 1st Qu.: 10.78 1st Qu.:0.530 1st Qu.: 3.092
## Median : 29.91 Median : 19.43 Median :0.720 Median : 4.370
## Mean : 31.19 Mean : 27.96 Mean :0.878 Mean : 6.227
## 3rd Qu.: 41.48 3rd Qu.: 34.67 3rd Qu.:1.048 3rd Qu.: 6.475
## Max. :100.00 Max. :255.18 Max. :5.110 Max. :56.810
## equity_turnnover rev_growth_rate margin_growth_rate
## Min. : 0.030 Min. :-41.470 Min. :-328.530
## 1st Qu.: 0.745 1st Qu.: -4.827 1st Qu.: -9.572
## Median : 1.035 Median : 5.745 Median : 8.200
## Mean : 1.447 Mean : 12.815 Mean : 24.434
## 3rd Qu.: 1.665 3rd Qu.: 17.852 3rd Qu.: 27.150
## Max. :10.080 Max. :573.570 Max. : 722.800
## op_profit_growth_rate cash_reinv_rate asset_growth_rate
## Min. :-449.33 Min. :-62.240 Min. :-52.860
## 1st Qu.: -15.50 1st Qu.: -3.237 1st Qu.: -3.560
## Median : 15.76 Median : 2.620 Median : 4.240
## Mean : 36.69 Mean : 1.731 Mean : 7.537
## 3rd Qu.: 67.62 3rd Qu.: 8.582 3rd Qu.: 13.447
## Max. :1708.73 Max. : 35.170 Max. :161.800
## current_ratio quick_rartio debt_ratio
## Min. : 50.35 Min. : 19.12 Min. : 0.97
## 1st Qu.: 187.37 1st Qu.: 130.28 1st Qu.:17.72
## Median : 297.32 Median : 206.93 Median :26.68
## Mean : 394.05 Mean : 300.61 Mean :30.27
## 3rd Qu.: 454.46 3rd Qu.: 328.61 3rd Qu.:43.47
## Max. :3783.42 Max. :3748.99 Max. :94.24
為了瞭解變數間比次的相關程度,我們會利用 cor 函數計算變數彼此間的相關係數。
cor(financial.data[, 2:ncol(financial.data)])## roe roa profit_margin_rate
## roe 1.000000e+00 0.927613723 0.769836624
## roa 9.276137e-01 1.000000000 0.849678033
## profit_margin_rate 7.698366e-01 0.849678033 1.000000000
## gross_margin_rate 2.639333e-01 0.316508192 0.276079736
## expense_rate -5.639643e-01 -0.602167492 -0.768924244
## asset_turnover 1.479487e-01 0.166463783 0.116560159
## inventory_turnover 6.099483e-02 0.049871996 0.096369338
## equity_turnnover -8.465143e-02 0.038691090 0.030993761
## rev_growth_rate 1.788903e-01 0.182344013 0.112338020
## margin_growth_rate 4.237200e-02 0.025256508 0.008797824
## op_profit_growth_rate 1.632351e-01 0.146719332 0.138618099
## cash_reinv_rate 5.184223e-01 0.504324078 0.477952503
## asset_growth_rate 3.667274e-01 0.388442882 0.300383311
## current_ratio 4.332876e-05 -0.006202091 -0.077164016
## quick_rartio 5.196865e-02 0.046996374 -0.015659771
## debt_ratio -1.258534e-01 -0.053587292 0.034495869
## gross_margin_rate expense_rate asset_turnover
## roe 0.26393332 -0.56396427 0.14794874
## roa 0.31650819 -0.60216749 0.16646378
## profit_margin_rate 0.27607974 -0.76892424 0.11656016
## gross_margin_rate 1.00000000 0.39128488 -0.27028252
## expense_rate 0.39128488 1.00000000 -0.29526650
## asset_turnover -0.27028252 -0.29526650 1.00000000
## inventory_turnover -0.31354281 -0.26958440 0.29470755
## equity_turnnover -0.33231257 -0.24999003 0.81529132
## rev_growth_rate -0.04901623 -0.14229687 0.64183724
## margin_growth_rate -0.03352279 -0.02418444 0.09165550
## op_profit_growth_rate 0.12722264 -0.04348225 0.10178782
## cash_reinv_rate 0.13386636 -0.36882576 -0.02304687
## asset_growth_rate 0.19833667 -0.15952707 0.24830284
## current_ratio 0.47049269 0.37547646 -0.20018000
## quick_rartio 0.49806646 0.33611349 -0.23187752
## debt_ratio -0.43050696 -0.30251126 0.25122647
## inventory_turnover equity_turnnover rev_growth_rate
## roe 0.060994826 -0.08465143 0.178890323
## roa 0.049871996 0.03869109 0.182344013
## profit_margin_rate 0.096369338 0.03099376 0.112338020
## gross_margin_rate -0.313542810 -0.33231257 -0.049016229
## expense_rate -0.269584397 -0.24999003 -0.142296865
## asset_turnover 0.294707548 0.81529132 0.641837236
## inventory_turnover 1.000000000 0.22341445 0.238834411
## equity_turnnover 0.223414455 1.00000000 0.352674052
## rev_growth_rate 0.238834411 0.35267405 1.000000000
## margin_growth_rate -0.009975919 0.03092636 0.348369257
## op_profit_growth_rate -0.070522242 0.04649760 0.277928273
## cash_reinv_rate 0.059059882 -0.02463604 -0.019240757
## asset_growth_rate 0.136004623 0.14819216 0.354016268
## current_ratio -0.159251676 -0.29358895 0.032301834
## quick_rartio -0.120757063 -0.29589570 0.008888558
## debt_ratio 0.193927223 0.60197288 -0.006813672
## margin_growth_rate op_profit_growth_rate
## roe 0.042371996 1.632351e-01
## roa 0.025256508 1.467193e-01
## profit_margin_rate 0.008797824 1.386181e-01
## gross_margin_rate -0.033522794 1.272226e-01
## expense_rate -0.024184436 -4.348225e-02
## asset_turnover 0.091655503 1.017878e-01
## inventory_turnover -0.009975919 -7.052224e-02
## equity_turnnover 0.030926357 4.649760e-02
## rev_growth_rate 0.348369257 2.779283e-01
## margin_growth_rate 1.000000000 3.771601e-01
## op_profit_growth_rate 0.377160119 1.000000e+00
## cash_reinv_rate 0.048577168 1.434912e-01
## asset_growth_rate 0.087707834 2.491875e-01
## current_ratio 0.001143093 -2.123743e-02
## quick_rartio -0.019162942 -9.133092e-03
## debt_ratio -0.053638320 -4.886632e-05
## cash_reinv_rate asset_growth_rate current_ratio
## roe 0.51842229 0.36672740 4.332876e-05
## roa 0.50432408 0.38844288 -6.202091e-03
## profit_margin_rate 0.47795250 0.30038331 -7.716402e-02
## gross_margin_rate 0.13386636 0.19833667 4.704927e-01
## expense_rate -0.36882576 -0.15952707 3.754765e-01
## asset_turnover -0.02304687 0.24830284 -2.001800e-01
## inventory_turnover 0.05905988 0.13600462 -1.592517e-01
## equity_turnnover -0.02463604 0.14819216 -2.935890e-01
## rev_growth_rate -0.01924076 0.35401627 3.230183e-02
## margin_growth_rate 0.04857717 0.08770783 1.143093e-03
## op_profit_growth_rate 0.14349123 0.24918754 -2.123743e-02
## cash_reinv_rate 1.00000000 0.05643202 -2.392760e-02
## asset_growth_rate 0.05643202 1.00000000 4.641141e-02
## current_ratio -0.02392760 0.04641141 1.000000e+00
## quick_rartio 0.03713851 0.06604236 9.753530e-01
## debt_ratio 0.01151498 0.10378057 -5.861077e-01
## quick_rartio debt_ratio
## roe 0.051968653 -1.258534e-01
## roa 0.046996374 -5.358729e-02
## profit_margin_rate -0.015659771 3.449587e-02
## gross_margin_rate 0.498066464 -4.305070e-01
## expense_rate 0.336113493 -3.025113e-01
## asset_turnover -0.231877517 2.512265e-01
## inventory_turnover -0.120757063 1.939272e-01
## equity_turnnover -0.295895701 6.019729e-01
## rev_growth_rate 0.008888558 -6.813672e-03
## margin_growth_rate -0.019162942 -5.363832e-02
## op_profit_growth_rate -0.009133092 -4.886632e-05
## cash_reinv_rate 0.037138507 1.151498e-02
## asset_growth_rate 0.066042364 1.037806e-01
## current_ratio 0.975353040 -5.861077e-01
## quick_rartio 1.000000000 -5.246527e-01
## debt_ratio -0.524652674 1.000000e+00
我通常喜歡使用熱圖 (heatmap) 視覺化變數間的相關程度,如果要使用 ggplot2 繪製相關係數的熱圖,必須先將資料整理成 tidy 的「變數 1 - 變數 2 - 相關係數」資料架構。我們可以利用 reshape2套件中的melt函數輕鬆把矩陣格式轉換成 tidy 資料。
library(reshape2)##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
head(melt(cor(financial.data[, 2:ncol(financial.data)])), 5)## Var1 Var2 value
## 1 roe roe 1.0000000
## 2 roa roe 0.9276137
## 3 profit_margin_rate roe 0.7698366
## 4 gross_margin_rate roe 0.2639333
## 5 expense_rate roe -0.5639643
接著我們就可以用geom_tile(繪製方形圖)繪製相關係數的熱圖囉!可以看到很清楚的變數群聚現象:
- 盈利能力變數彼此間高度相關
- 營業費用率顯然與盈利能力負相關
- 你還發現哪些高度正/負相關的比率?如何詮釋?
ggplot(melt(cor(financial.data[, 2:ncol(financial.data)])),
aes(Var1, Var2)) +
geom_tile(aes(fill = value), colour = "white") +
scale_fill_gradient2(low = "firebrick4", high = "steelblue",
mid = "white", midpoint = 0) +
guides(fill=guide_legend(title="Correlation")) +
theme_bw() +
theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1),
axis.title = element_blank())資料建模與分析
首先,我們要建立 PCA 模型,可以利用 R 語言中的 prcomp 函數。其中,如果你希望輸入的資料矩陣先標準化在做參數估計,可以設定 scales = T。
pca.model <- prcomp(financial.data[, 2:ncol(financial.data)], scale = T)
names(pca.model)## [1] "sdev" "rotation" "center" "scale" "x"
可以透過 summary 函數看到主成份分析每個主成份的解釋變異程度,可以發現只要六個主成分就能夠解釋 80% 的資料變異。
summary(pca.model)## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 2.0330 1.8739 1.4399 1.16215 1.01323 0.91896
## Proportion of Variance 0.2583 0.2195 0.1296 0.08441 0.06417 0.05278
## Cumulative Proportion 0.2583 0.4778 0.6074 0.69177 0.75594 0.80872
## PC7 PC8 PC9 PC10 PC11 PC12
## Standard deviation 0.87186 0.77645 0.73927 0.67853 0.56103 0.50353
## Proportion of Variance 0.04751 0.03768 0.03416 0.02878 0.01967 0.01585
## Cumulative Proportion 0.85623 0.89391 0.92806 0.95684 0.97651 0.99236
## PC13 PC14 PC15 PC16
## Standard deviation 0.27912 0.15178 0.12692 0.07223
## Proportion of Variance 0.00487 0.00144 0.00101 0.00033
## Cumulative Proportion 0.99723 0.99867 0.99967 1.00000
我們也可以透過解釋變異 / 累積解釋比率圖來選取主成份: - var:該主成份解釋變異數的數值 - prop:該主成份解釋變異數的比率 = PC 變異數 / 總變異 - cum_prop:該主成份解釋變異數的累積比率
由於 pca.model 只能夠抓出每一個 PC 的標準差 pca.model$sdev,所以我們需要先建立以下表格,計算出上述各項數值。
var.exp <- tibble(
pc = paste0("PC_", formatC(1:16, width=2, flag="0")),
var = pca.model$sdev^2,
prop = (pca.model$sdev)^2 / sum((pca.model$sdev)^2),
cum_prop = cumsum((pca.model$sdev)^2 / sum((pca.model$sdev)^2)))
head(var.exp, 5)## # A tibble: 5 x 4
## pc var prop cum_prop
## <chr> <dbl> <dbl> <dbl>
## 1 PC_01 4.13 0.258 0.258
## 2 PC_02 3.51 0.219 0.478
## 3 PC_03 2.07 0.130 0.607
## 4 PC_04 1.35 0.0844 0.692
## 5 PC_05 1.03 0.0642 0.756
接著,我們可以利用 plotly 套件建立解釋變異的條狀圖。
library(plotly)##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
plot_ly(
x = var.exp$pc,
y = var.exp$var,
type = "bar"
) %>%
layout(
title = "Variance Explained by Each Principal Component",
xaxis = list(type = 'Principal Component', tickangle = -60),
yaxis = list(title = 'Variance'),
margin = list(r = 30, t = 50, b = 70, l = 50)
)plot_ly(
x = var.exp$pc,
y = var.exp$cum_prop,
type = "bar"
) %>%
layout(
title = "Cumulative Proportion by Each Principal Component",
xaxis = list(type = 'Principal Component', tickangle = -60),
yaxis = list(title = 'Proportion'),
margin = list(r = 30, t = 50, b = 70, l = 50)
)從上述分析,建議取前六個主成份即可。接下來,讓我們更仔細的了解每一個主成份的係數,以便解釋主成份內容。如果想要找出主成份的係數矩陣,可以透過 pca.model$rotation。
head(pca.model$rotation, 5)## PC1 PC2 PC3 PC4
## roe 0.35051324 -0.30782632 0.09992653 -0.01265022
## roa 0.37209821 -0.29926633 0.09449066 -0.04995561
## profit_margin_rate 0.37420084 -0.25303809 0.17092884 -0.03880183
## gross_margin_rate -0.07651638 -0.40212860 -0.08460058 0.03060840
## expense_rate -0.40631496 -0.01939565 -0.21713947 0.05973301
## PC5 PC6 PC7 PC8
## roe -0.01349820 0.03456689 -0.15906558 0.07416622
## roa 0.08224210 -0.01771183 -0.12638004 -0.00498288
## profit_margin_rate 0.02278657 -0.01866967 -0.06950275 -0.16719939
## gross_margin_rate 0.40812461 0.01386915 0.01697671 0.32211008
## expense_rate 0.23611021 0.04571298 0.09960746 0.38295533
## PC9 PC10 PC11 PC12
## roe -0.009130585 0.09361023 -0.2382325 0.5499299
## roa -0.056233502 -0.08161125 -0.1409867 0.3340492
## profit_margin_rate 0.052479810 -0.34314001 0.2211807 -0.3876528
## gross_margin_rate -0.272326633 -0.47017407 0.1271494 -0.1790633
## expense_rate -0.239559563 -0.01790162 -0.1261054 0.2792875
## PC13 PC14 PC15 PC16
## roe -0.37672698 -0.457679257 -0.13920077 -0.03555639
## roa 0.58984636 0.480985247 0.11922953 0.05209979
## profit_margin_rate -0.12171515 0.004062131 -0.05631017 -0.62759258
## gross_margin_rate -0.13036971 -0.066220940 -0.01451809 0.43415917
## expense_rate 0.04837154 0.096936721 -0.02701126 -0.63664808
在這裡,我們使用跟共變異數矩陣一樣的視覺化方法。
ggplot(melt(pca.model$rotation[, 1:6]), aes(Var2, Var1)) +
geom_tile(aes(fill = value), colour = "white") +
scale_fill_gradient2(low = "firebrick4", high = "steelblue",
mid = "white", midpoint = 0) +
guides(fill=guide_legend(title="Coefficient")) +
theme_bw() +
theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1),
axis.title = element_blank())非負稀疏主成份分析
我們發現上面的主成份其實很難解釋,所以改採用非負稀疏主成份分析,可以利用 nsprcomp 套件中的 nscumcomp 完成,其中有兩個重要的參數:
k:非 0 係數個數,通常是「每個主成份期待非 0 係數個數」x 變數個數nneg:是否希望所有係數都非負,TRUE代表有非負限制
set.seed(1234)
library(nsprcomp)
nspca.model <- nscumcomp(
financial.data[, 2:17],
k = 90, nneg = T,
scale. = T)var.exp <- tibble(
pc = paste0("PC_", formatC(1:16, width=2, flag="0")),
var = nspca.model$sdev^2,
prop = (nspca.model$sdev)^2 / sum((nspca.model$sdev)^2),
cum_prop = cumsum((nspca.model$sdev)^2 / sum((nspca.model$sdev)^2)))
head(var.exp, 5)## # A tibble: 5 x 4
## pc var prop cum_prop
## <chr> <dbl> <dbl> <dbl>
## 1 PC_01 1.44 0.158 0.158
## 2 PC_02 1.19 0.132 0.290
## 3 PC_03 1.07 0.118 0.408
## 4 PC_04 0.906 0.0999 0.508
## 5 PC_05 0.888 0.0979 0.606
同樣,我們可以透過解釋變異與累積比率決定主成份個數,此處可以採用 8 個主成份。
library(plotly)
plot_ly(
x = var.exp$pc,
y = var.exp$var,
type = "bar"
) %>%
layout(
title = "Variance Explained by Each Principal Component",
xaxis = list(type = 'Principal Component', tickangle = -60),
yaxis = list(title = 'Variance'),
margin = list(r = 30, t = 50, b = 70, l = 50)
)plot_ly(
x = var.exp$pc,
y = var.exp$cum_prop,
type = "bar"
) %>%
layout(
title = "Cumulative Proportion by Each Principal Component",
xaxis = list(type = 'Principal Component', tickangle = -60),
yaxis = list(title = 'Proportion'),
margin = list(r = 30, t = 50, b = 70, l = 50)
)接下來,讓我們看看非負稀疏主成份的係數權重。從熱圖中可以看到:
- 主成份 1 重點為「股東權益獲利與成長能力」
- 主成份 2 重點為「資產獲利能力」
- 主成份 3 重點為「毛利與週轉率」
- … 你能不能夠幫忙其他主成份命名呢?
ggplot(melt(nspca.model$rotation[, 1:8]), aes(Var2, Var1)) +
geom_tile(aes(fill = value), colour = "white") +
scale_fill_gradient2(low = "white", high = "steelblue") +
guides(fill=guide_legend(title="Coefficient")) +
theme_bw() +
theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1),
axis.title = element_blank())公司個別分析
在這裡教大家一個找出特別公司的方法,就是繪製「主成份分數」與「該主成份係數最大變數」的散佈圖。比如說下圖中,可以找出幾種特別怪異的公司:
- 給定 ROE,PC 1 特別卓越:6684安格
- 給定 PC1,ROE 超級低:6291沛亨
nspca.score <- data.frame(nspca.model$x)
row.names(nspca.score) <- financial.data$comp_id
plot_ly(
x = nspca.score[, 1],
y = financial.data$roe,
text = financial.data$comp_id,
type = "scatter",
mode = "markers"
) %>% layout(
title = "ROE v.s. PC 1 Score: Scatter Plot",
xaxis = list(title = 'Principal Component 1'),
yaxis = list(title = 'Return on Equity'),
margin = list(r = 30, t = 50, b = 70, l = 50)
)另外,透過不同主成份的散佈圖,也可以找到再多種面向都很傑出的公司:
- 3529力旺、6643丹星、6684安格在「資產獲利能力」與「毛利與週轉率」都特別傑出,值得關注。
plot_ly(
x = nspca.score[, 2],
y = nspca.score[, 3],
text = financial.data$comp_id,
type = "scatter",
mode = "markers"
) %>% layout(
title = "PC 2 v.s. PC 3 Score: Scatter Plot",
xaxis = list(title = 'Principal Component 2'),
yaxis = list(title = 'Principal Component 3'),
margin = list(r = 30, t = 50, b = 70, l = 50)
)